
###################################################################3
####################  NATURAL HARBORS       ########################
####################################################################
#DATA AND LIBRARIES
#coastlines
library(haven)
squiggly <- read_dta("data/predictports.dta")
# set memory if on windows
if(.Platform$OS.type == "unix") {
} else {
  memory.limit(400000)
}

#naturalharbors
harbors <- read_dta("data/priogrid2wpi.dta")

harbors <- subset(harbors, wpi53natportsdum==1)


harbors$port_nat <- 1

#merging
data <- merge(squiggly, harbors, by=c("gid"), all.x=T)

#gwno
gwno <- read.csv("data/Pgrid.csv")
colnames(gwno)[1] <- "gid"
gwno <- subset(gwno, year==2012)

data <- merge(data, gwno, by="gid")

#creating region variables
data$region[data$gwno>2 & data$gwno<100] <- "NamCentr"
data$region[data$gwno>99 & data$gwno<166] <- "Latam"
data$region[data$gwno>166 & data$gwno<396] <- "Eur"
data$region[data$gwno>401 & data$gwno<591] <- "SSA"
data$region[data$gwno>590 & data$gwno<699] <- "Mena"
data$region[data$gwno>401 & data$gwno<591] <- "SSA"
data$region[data$gwno>698 & data$gwno<860] <- "Asia"
data$region[data$gwno>859] <- "Oceania"

#port variable
data$port_nat[is.na(data$port_nat)==T] <- 0

#packages
library(ISLR)
library(gam)
library(separationplot)
library(verification)
library(caTools)
library(pROC)
library(ROCR)
library(glmnet)

table(data$region)
data$region[is.na(data$region)==T] <- "Miss"
################################################
##########       EXPLORATORY    ################
################################################

#simple linear, polynomial and with regions
summary(lm(port_nat ~ nodeweight, data=data))
summary(lm(port_nat ~ poly(nodeweight, 5), data=data)) #the polynomial predicts better
summary(lm(port_nat ~ poly(nodeweight, 5) + as.factor(region) -1 , data=data))
       #improvement when we include region (to 1% r-squared which is still low)

#with logged variable linear, polynomial and with regions
data$lnodeweight <- log(data$nodeweight+0.01)
summary(lm(port_nat ~ lnodeweight, data=data)) #R squared=.001
summary(lm(port_nat ~ poly(lnodeweight, 5), data=data)) #R squared = .015 
summary(lm(port_nat ~ poly(lnodeweight, 8), data=data)) #R squared = .017 the polynomial predicts better
summary(lm(port_nat ~ poly(lnodeweight, 5) + as.factor(region)-1, data=data)) #R squared = .03 #improvement when we include region
summary(lm(port_nat ~ poly(lnodeweight, 4)*as.factor(region)-1, data=data)) #R squared = .0412 #improvement when we include region



#clipping out observations with length below certain thresholds
#what happens when we exclude those with very low length

#20
data$nodeweight2 <- data$nodeweight
data$nodeweight2[data$length<20] <- 0
data$lnodeweight2 <- log(data$nodeweight2+0.01)

summary(lm(port_nat ~ lnodeweight2, data=data))
summary(lm(port_nat ~ poly(lnodeweight2, 5), data=data)) #R-squared=.039 #much better fit
summary(lm(port_nat ~ poly(lnodeweight2, 5) + as.factor(region)-1, data=data)) #R-squared=.2152 
summary(lm(port_nat ~ poly(lnodeweight2, 5)* as.factor(region)-1, data=data)) #R-squared=.2297 

#70
data$nodeweight3 <- data$nodeweight2
data$nodeweight3[data$length<70] <- 0
data$lnodeweight3 <- log(data$nodeweight3+0.01)

#slightly better
summary(lm(port_nat ~ lnodeweight3, data=data))
summary(lm(port_nat ~ poly(lnodeweight3, 5), data=data)) #much better fit! 
summary(lm(port_nat ~ poly(lnodeweight3, 4), data=data))  #R-squared=.087 #much better fit #can we reduce the splines and improve?
summary(lm(port_nat ~ poly(lnodeweight3, 4)* as.factor(region)-1, data=data))  #R-squared=.2649 #much better fit #can we reduce the splines and improve?
summary(lm(port_nat ~ poly(lnodeweight3, 4) + as.factor(region)-1, data=data))  #R-squared=.2513 #much better fit #can we reduce the splines and improve?

summary(lm(port_nat ~ poly(lnodeweight3, 4) + as.factor(region)-1, data=data))  #R-squared=.2513 #much better fit #can we reduce the splines and improve?

summary(lm(port_nat ~ poly(lnodeweight3, 4)* as.factor(region)-1 + length + nodes, data=data))  #R-squared=.264
summary(lm(port_nat ~ poly(lnodeweight3, 4) + as.factor(region)-1 + length + nodes, data=data))  #R-squared=.264




#the winners are one with lnodeweight3 4 splines , the same one + region, the same one * region, the same one * region + length and nodes

#three models
summary(mod1 <- lm(port_nat ~ poly(lnodeweight3, 4) + length + nodes, data=data))
data$predmod1 <- mod1$fitted.values

summary(mod2 <- lm(port_nat ~ poly(lnodeweight3, 4)* as.factor(region)-1 + length + nodes, data=data)) 
data$predmod2 <- mod2$fitted.values

summary(mod3 <- lm(port_nat ~ poly(lnodeweight3, 4) + as.factor(region)-1 + length + nodes, data=data)) #do nodes improve the fit? YES
data$predmod3 <- mod3$fitted.values

#Creating different prediction thresholds (different predicted values)

#predmod1
data$one_port_nat_pred0 <- ifelse(data$predmod1>.1,1,0)
data$one_port_nat_pred1 <- ifelse(data$predmod1>.2,1,0)
data$one_port_nat_pred2 <- ifelse(data$predmod1>.3,1,0)
data$one_port_nat_pred3 <- ifelse(data$predmod1>.4,1,0)

#predmod2
data$two_port_nat_pred0 <- ifelse(data$predmod2>.1,1,0)
data$two_port_nat_pred1 <- ifelse(data$predmod2>.2,1,0)
data$two_port_nat_pred2 <- ifelse(data$predmod2>.3,1,0)
data$two_port_nat_pred3 <- ifelse(data$predmod2>.4,1,0)

#predmod3
data$three_port_nat_pred0 <- ifelse(data$predmod3>.1,1,0)
data$three_port_nat_pred1 <- ifelse(data$predmod3>.2,1,0)
data$three_port_nat_pred2 <- ifelse(data$predmod3>.3,1,0)
data$three_port_nat_pred3 <- ifelse(data$predmod3>.4,1,0)


#binary prediction - lowest threshold
summary(lm(port_nat~ one_port_nat_pred0, data=data)) #winner
summary(lm(port_nat~ two_port_nat_pred0, data=data))
summary(lm(port_nat~ three_port_nat_pred0, data=data))

#second threshold
summary(lm(port_nat~ one_port_nat_pred1, data=data)) 
summary(lm(port_nat~ two_port_nat_pred1, data=data)) #winner
summary(lm(port_nat~ three_port_nat_pred1, data=data)) 

#third threshold
summary(lm(port_nat~ one_port_nat_pred2, data=data)) 
summary(lm(port_nat~ two_port_nat_pred2, data=data)) #winner
summary(lm(port_nat~ three_port_nat_pred2, data=data)) 

#final threshold
summary(lm(port_nat~ one_port_nat_pred3, data=data))  #winner
summary(lm(port_nat~ two_port_nat_pred3, data=data))
summary(lm(port_nat~ three_port_nat_pred3, data=data)) 


#########################################################
###############       AUROC AND PR   ####################
#########################################################
#lets see which of these produces the best AUC

g1p1 <- glm(port_nat ~ two_port_nat_pred1, data=data, family=binomial(link="logit"))
g2p1 <- glm(port_nat ~ two_port_nat_pred2, data=data, family=binomial(link="logit"))

#predictions
preds1 <- predict.glm(g1p1, data,type="response")
preds2 <-  predict.glm(g2p1, data,type="response")

#rocs
roc1 <- prediction(preds1, data$port_nat)
roc2 <-  prediction(preds2, data$port_nat)

#extracting concrete predictions from the rocopbject
## x <- as.data.frame(unlist(roc0@tn))
## x2 <- as.data.frame(unlist(roc0@cutoffs))
## predictions <- cbind(x,x2)
## colnames(predictions) <- c("tps", "cutoffs")
## predictions$tps[predictions$cutoffs>.50]
## data$baselinepredictions <- preds0 #inserting predictions (false negatives into mydata)

#getting the AUROC
auc01 <- performance(roc1,"auc") #getting the AUROC #winner
auc02 <- performance(roc2,"auc") #getting the AUROC

#The most successful predictor of natural port_nats is from a model with
#logged nodelength clipped at 70, nodelength with 4 polynomials, and length and nodes as covariates.
#Regions dont improve the fit

#The best binary prediction model is when we set the predicted values from this mdoel above .1 and classify
table(data$two_port_nat_pred1, data$port_nat) #58 percent of port_nats are in these cells #34% percent of these cells contain ports..
table(data$two_port_nat_pred2, data$port_nat) #39% of port nats are in these cells #42% contain ports

table(data$one_port_nat_pred0, data$port_nat) #80


#which are left? port_nat_pred0, port_nat_pred1
names(data)
data <- subset(data, select=c(gid,
                              length,
                              nodes,
                              nodeweight,
                              region,
                              lnodeweight,
                              nodeweight2,
                              nodeweight3,
                              lnodeweight2,
                              lnodeweight3,
                              two_port_nat_pred1,
                              two_port_nat_pred2 ))

data$natport_53_pred <- data$two_port_nat_pred1

###################################################################3
####################  ALL PORTS      ########################
####################################################################

#DATA AND LIBRARIES

#naturalharbors
harbors <- read_dta("data/priogrid2wpi.dta")

harbors <- subset(harbors, wpi53portsdum==1)


harbors$port_all <- 1


#merging
data <- merge(data, harbors, by=c("gid"), all.x=T)


#port variable
data$port_all[is.na(data$port_all)==T] <- 0
################################################
##########       EXPLORATORY    ################
################################################

#simple linear, polynomial and with regions
summary(lm(port_all ~ nodeweight, data=data))
summary(lm(port_all ~ poly(nodeweight, 5), data=data)) #the polynomial predicts better
summary(lm(port_all ~ poly(nodeweight, 5) + as.factor(region), data=data)) #improvement when we include region

#with logged variable linear, polynomial and with regions
data$lnodeweight <- log(data$nodeweight+0.01)
summary(lm(port_all ~ lnodeweight, data=data))
summary(lm(port_all ~ poly(lnodeweight, 5), data=data)) #the polynomial predicts better
summary(lm(port_all ~ poly(lnodeweight, 5) + as.factor(region), data=data)) #improvement when we include region

#clipping out observations with length below certain thresholds
#what happens when we exclude those with very low length

#20
data$nodeweight2 <- data$nodeweight
data$nodeweight2[data$length<20] <- 0
data$lnodeweight2 <- log(data$nodeweight2+0.01)

summary(lm(port_all ~ lnodeweight2, data=data))
summary(lm(port_all ~ poly(lnodeweight2, 5), data=data)) #much better fit


#70
data$nodeweight3 <- data$nodeweight2
data$nodeweight3[data$length<70] <- 0
data$lnodeweight3 <- log(data$nodeweight3+0.01)

#slightly better
summary(lm(port_all ~ lnodeweight3, data=data))
summary(lm(port_all ~ poly(lnodeweight3, 5), data=data)) #much better fit! 

summary(lm(port_all ~ poly(lnodeweight3, 4), data=data)) #can we reduce the splines and improve?
summary(lm(port_all ~ poly(lnodeweight3, 4) + as.factor(region)-1, data=data)) #can we reduce the splines and improve?
summary(lm(port_all ~ poly(lnodeweight3, 4)*as.factor(region)-1, data=data)) #can we reduce the splines and improve?


#three models
summary(mod1 <- lm(port_all ~ poly(lnodeweight3, 4), data=data))
data$predmod1 <- mod1$fitted.values

summary(mod2 <- lm(port_all ~ poly(lnodeweight3, 4) + as.factor(region)-1, data=data)) #does length improve the fit? YES
data$predmod2 <- mod2$fitted.values

summary(mod3 <- lm(port_all ~ poly(lnodeweight3, 4)*as.factor(region)-1, data=data)) #do nodes improve the fit? YES
data$predmod3 <- mod3$fitted.values

#Creating different prediction thresholds (different predicted values)

#predmod1
data$one_port_all_pred0 <- ifelse(data$predmod1>.1,1,0)
data$one_port_all_pred1 <- ifelse(data$predmod1>.2,1,0)
data$one_port_all_pred2 <- ifelse(data$predmod1>.3,1,0)
data$one_port_all_pred3 <- ifelse(data$predmod1>.4,1,0)

#predmod2
data$two_port_all_pred0 <- ifelse(data$predmod2>.1,1,0)
data$two_port_all_pred1 <- ifelse(data$predmod2>.2,1,0)
data$two_port_all_pred2 <- ifelse(data$predmod2>.3,1,0)
data$two_port_all_pred3 <- ifelse(data$predmod2>.4,1,0)

#predmod3
data$three_port_all_pred0 <- ifelse(data$predmod3>.1,1,0)
data$three_port_all_pred1 <- ifelse(data$predmod3>.2,1,0)
data$three_port_all_pred2 <- ifelse(data$predmod3>.3,1,0)
data$three_port_all_pred3 <- ifelse(data$predmod3>.4,1,0)


#binary prediction - lowest threshold
summary(lm(port_all~ one_port_all_pred0, data=data)) 
summary(lm(port_all~ two_port_all_pred0, data=data))
summary(lm(port_all~ three_port_all_pred0, data=data))

#second threshold
summary(lm(port_all~ one_port_all_pred1, data=data))  #winner
summary(lm(port_all~ two_port_all_pred1, data=data))
summary(lm(port_all~ three_port_all_pred1, data=data)) 

#third threshold
summary(lm(port_all~ one_port_all_pred2, data=data)) 
summary(lm(port_all~ two_port_all_pred2, data=data))
summary(lm(port_all~ three_port_all_pred2, data=data)) #credible

#final threshold
summary(lm(port_all~ one_port_all_pred3, data=data)) 
summary(lm(port_all~ two_port_all_pred3, data=data)) #winner
summary(lm(port_all~ three_port_all_pred3, data=data)) 


#natural port predictors
summary(lm(port_all~ two_port_nat_pred1, data=data)) 
summary(lm(port_all~ two_port_nat_pred2, data=data)) #winner


#########################################################
###############       AUROC AND PR   ####################
#########################################################
#lets see which of these produces the best AUC

#######0
g0p1 <- glm(port_all ~ one_port_all_pred2, data=data, family=binomial(link="logit"))
g1p1 <- glm(port_all ~ two_port_all_pred3, data=data, family=binomial(link="logit"))

#predictions
preds0 <- predict.glm(g0p1, data,type="response")
preds1 <- predict.glm(g1p1, data,type="response")


#rocs
roc0 <- prediction(preds0, data$port_all)
roc1 <- prediction(preds1, data$port_all)

#extracting concrete predictions from the rocopbject
x <- as.data.frame(unlist(roc0@tn))
x2 <- as.data.frame(unlist(roc0@cutoffs))
predictions <- cbind(x,x2)
colnames(predictions) <- c("tps", "cutoffs")
predictions$tps[predictions$cutoffs>.50]
data$baselinepredictions <- preds0 #inserting predictions (false negatives into mydata)

auc00 <- performance(roc0,"auc") #getting the AUROC
auc01 <- performance(roc1,"auc") #getting the AUROC #winner
auc02 <- performance(roc2,"auc") #getting the AUROC



#
table(data$one_port_all_pred2, data$port) #80 percent of ports are in these cells
table(data$two_port_all_pred3, data$port) #26% percent of these cells contain ports..


#final heat:

data$allport_53_pred <- data$one_port_all_pred2

data <- subset(data, select=c(gid, allport_53_pred, natport_53_pred))
#two_port_pred1 (best predictor of all ports based on all port data)
#three_port_nat_pred0, three_port_nat_pred1





